Learning Goals

Lab Description

We will work with two Starbucks datasets, one on the store locations (global) and one for the nutritional data for their food and drink items. We will do some text analysis of the menu items.

Steps

0. Install and load libraries

1. Read in the data

  • There are 4 datasets to read in: Starbucks locations, Starbucks nutrition, US population by state, and US state abbreviations.
# Load the four input tables. `show_col_types = FALSE` keeps readr from
# printing the guessed column specification for each file.

# Starbucks store locations
sb_locs <- read_csv(file = "starbucks-locations.csv", show_col_types = FALSE)

# Nutrition facts for Starbucks menu items
sb_nutr <- read_csv(
  file = "starbucks-menu-nutrition.csv",
  show_col_types = FALSE
)

# US population by state
usa_pop <- read_csv(file = "us_state_pop.csv", show_col_types = FALSE)

# US state names and abbreviations
usa_states <- read_csv(file = "states.csv", show_col_types = FALSE)

2. Look at the data

  • Inspect each dataset to look at variable names and ensure it was imported correctly
# Set eval=FALSE
# Open each imported table in the interactive data viewer to check the
# variable names and confirm the import looks right. View() needs an
# interactive session, which is why this chunk is not evaluated when knitting.
View(sb_locs)
View(sb_nutr)
View(usa_pop)
View(usa_states)

3. Format the data

  • Subset Starbucks data to the US.
  • Create counts of Starbucks stores by state.
  • Merge population in with the store count by state.
  • Inspect the range values for each variable.
# Keep only the store rows whose Country is "US".
sb_usa <- filter(sb_locs, Country == "US")

# One row per state/province code with the number of stores in it.
sb_locs_state <- count(
  sb_usa,
  state_abbr = `State/Province`,
  name = "n_stores"
)

# The population table is keyed by state name, so first attach the state
# names to the store counts via the abbreviation lookup.
usa_pop_abbr <- sb_locs_state |>
  full_join(usa_states, by = join_by(state_abbr == Abbreviation))

# Then bring in the population, matching on the state name.
# full_join() keeps unmatched rows from both sides, so some rows can have
# a missing n_stores.
sb_locs_state <- usa_pop_abbr |>
  full_join(usa_pop, by = join_by(State == state))

# Inspect the range of every variable.
summary(sb_locs_state)
##   state_abbr           n_stores         State             population      
##  Length:55          Min.   :   8.0   Length:55          Min.   :   56882  
##  Class :character   1st Qu.:  56.5   Class :character   1st Qu.: 1344331  
##  Mode  :character   Median : 123.0   Mode  :character   Median : 3751351  
##                     Mean   : 266.8                      Mean   : 5677621  
##                     3rd Qu.: 332.0                      3rd Qu.: 6515716  
##                     Max.   :2821.0                      Max.   :37253956  
##                     NA's   :4

4. Use ggplotly for EDA

Answer the following questions:

  • Is the number of Starbucks stores proportional to the population of a state? (scatterplot)

  • Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)

  • What are the top 20 words in Starbucks menu items? (bar plot)

# Scatterplot of state population against store count, one point per state,
# coloured by state abbreviation; ggplotly() makes it interactive.
p1 <- ggplot(
  sb_locs_state,
  aes(x = n_stores, y = population, color = state_abbr)
) +
  geom_point()
ggplotly(p1)
# Overlaid, semi-transparent calorie histograms, one per menu Category.
# position = "identity" draws the groups on top of each other instead of
# stacking them.
p2 <- ggplot(sb_nutr, aes(x = Calories, fill = Category)) +
  geom_histogram(position = "identity", alpha = 0.5)
ggplotly(p2)
# Bar plot of the 20 most frequent words in the menu item names.
# Item names are split into one word per row, counted, and the 20 most
# common words are kept.
p3 <- sb_nutr |>
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  head(20) |>
  # Turn `word` into a factor ordered by its count. Without this, ggplot
  # orders a character axis alphabetically and the ranking is lost; with it,
  # the flipped bars run from most frequent (top) to least frequent (bottom).
  mutate(word = reorder(word, n)) |>
  ggplot(aes(x = word, y = n, fill = word)) +
  geom_col() +
  coord_flip()

# The fill legend repeats the axis labels, so hide it.
ggplotly(p3) |> layout(showlegend = FALSE)

5. Scatterplots using plot_ly()

  • Create a scatterplot using plot_ly() representing the relationship between calories and carbs
  • Color points by category
# Interactive scatterplot of carbohydrates against calories, with points
# coloured by menu Category.
plot_ly(
  data = sb_nutr,
  type = "scatter",
  mode = "markers",
  x = ~Calories,
  y = ~`Carb. (g)`,
  color = ~Category
)
  • Create this scatterplot but for the items consisting of the top 10 words
  • Color again by category
  • Add hoverinfo specifying the word in the item name
  • Add layout information to title the chart and the axes
  • Enable hovermode = "compare"
# The 10 most frequent words across all menu item names.
topwords <- sb_nutr |>
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  slice_head(n = 10)

# Scatterplot restricted to items whose name contains one of the top words.
# After tokenising there is one row per (item, word) pair, so an item that
# contains several top words is drawn once per matching word.
sb_nutr |>
  unnest_tokens(word, Item) |>
  filter(word %in% topwords$word) |>
  plot_ly(
    type = "scatter",
    mode = "markers",
    x = ~Calories,
    y = ~`Carb. (g)`,
    color = ~Category,
    # Show only the custom hover text built below.
    hoverinfo = "text",
    hovertext = ~ paste0(
      "Item: ", word, "<br>",
      "Calories: ", Calories, "<br>",
      "Carb. (g): ", `Carb. (g)`
    )
  ) |>
  layout(
    title = "Carbohydrates vs. Calories for the Top 10 Item Words",
    xaxis = list(title = "Calories"),
    yaxis = list(title = "Carbohydrates (g)"),
    # NOTE(review): "compare" is the value the lab asks for; confirm that the
    # installed plotly version still accepts it as a hovermode.
    hovermode = "compare"
  )

6. plot_ly Boxplots

  • Create a boxplot of all of the nutritional variables, grouped by the top 10 item words.
# One row per (item, word) pair, keeping only the top 10 words.
filtered_data <- sb_nutr |>
  unnest_tokens(word, Item) |>
  filter(word %in% topwords$word)

# Grouped boxplots: for every word, one box per nutritional variable.
# The first trace is created by plot_ly(); each add_boxplot() adds another
# variable on the same x grouping.
plot_ly(
  filtered_data,
  x = ~word,
  y = ~Calories,
  type = "box",
  name = "Calories"
) |>
  add_boxplot(y = ~`Fat (g)`, name = "Fat") |>
  add_boxplot(y = ~`Carb. (g)`, name = "Carbohydrates") |>
  add_boxplot(y = ~`Fiber (g)`, name = "Fiber") |>
  add_boxplot(y = ~`Protein (g)`, name = "Protein") |>
  layout(
    # boxmode = "group" places the boxes for one word side by side.
    boxmode = "group",
    xaxis = list(title = "Item"),
    yaxis = list(title = "Nutritional Variables")
  )

7. 3D Scatterplot

  • Create a 3D scatterplot between Calories, Carbs, and Protein for the items containing the top 10 words
  • Do you see any patterns?
# 3D scatterplot of calories, carbohydrates and protein for the items that
# contain a top-10 word, coloured by that word.
plot_ly(
  data = filtered_data,
  type = "scatter3d",
  mode = "markers",
  x = ~Calories,
  y = ~`Carb. (g)`,
  z = ~`Protein (g)`,
  color = ~word
)

8. plot_ly Map

  • Create a map to visualize the number of stores per state, and another for the population by state. Use subplot to put the maps side by side.
  • Describe the differences if any.
# Set up mapping details
set_map_details <- list(
  scope = "usa",
  projection = list(type = "albers usa"),
  showlakes = TRUE,
  lakecolor = toRGB("steelblue")
)

# Make sure both maps are on the same color scale
shade_limit <- 125

# Create hover text
sb_locs_state$hover <- with(
  sb_locs_state,
  paste(
    "Number of Starbucks: ", n_stores, "<br>",
    "State: ", State, "<br>",
    "Population: ", population
  )
)
# Create the map
map1 <- plot_geo(sb_locs_state, locationmode = "USA-states") |>
  add_trace(
    z = ~n_stores,
    text = ~hover,
    locations = ~state_abbr,
    color = ~n_stores,
    colors = "Reds"
  ) |>
  layout(title = "Starbucks Stores by State", geo = set_map_details)


map2 <- plot_geo(sb_locs_state, locationmode = "USA-states") |>
  add_trace(
    z = ~population,
    text = ~hover,
    locations = ~state_abbr,
    color = ~population,
    colors = "Greens"
  ) |>
  layout(title = "Population by State", geo = set_map_details)

subplot(map1, map2)

We note that there is some association between the number of Starbucks stores and population. For example, California has the largest population, and it also has the most Starbucks stores. Texas, Washington, New York and Florida have relatively large populations, and they also have more Starbucks stores than other states.